{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# 准备工作"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [],
   "source": [
    "'''\n",
    "阿里研究院\n",
    "阿里健康\n",
    "阿里巴巴商学院\n",
    "阿里数据\n",
    "\n",
    "腾讯金融科技\n",
    "腾讯研究院\n",
    "腾讯媒体研究院\n",
    "腾讯云启研究院\n",
    "酷鹅用户研究院\n",
    "'''\n",
    "公众号 = \"吃喝玩乐IN广州\""
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [],
   "source": [
    "fn = { \"output\" : { \"公众号_htm_snippets\": \"data_raw_src/公众号_htm_snippets_{公众号}.tsv\",\n",
    "                    \"公众号_df\": \"data_raw_src/公众号_df_{公众号}.tsv\",\n",
    "                    \"公众号_xlsx\": \"data_sets/公众号_url_{公众号}.xlsx\" } \\\n",
    "      }"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# 采集公众号（requests）"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "0\n",
      "5\n",
      "10\n",
      "15\n",
      "20\n"
     ]
    }
   ],
   "source": [
    "# Target URL and request parameters for the requests-based collector.\n",
    "\n",
    "import os\n",
    "import time\n",
    "import requests\n",
    "import pandas as pd\n",
    "import csv\n",
    "\n",
    "\n",
    "url = \"https://mp.weixin.qq.com/cgi-bin/appmsg\"\n",
    "\n",
    "# Reuse the cookie of an already logged-in session so no login step is needed here.\n",
    "# SECURITY: the cookie and the token are credentials, so they are read from environment\n",
    "# variables instead of being stored in the notebook. A missing variable raises KeyError\n",
    "# right here rather than producing a confusing failure during the requests.\n",
    "headers = {\n",
    "  \"Cookie\": os.environ[\"WECHAT_MP_COOKIE\"],\n",
    "  \"User-Agent\": \"Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.181 Safari/537.36\"}\n",
    "\n",
    "# Query-string parameters sent with every request.\n",
    "data = {\n",
    "    \"token\": os.environ[\"WECHAT_MP_TOKEN\"],\n",
    "    \"lang\": \"zh_CN\",\n",
    "    \"f\": \"json\",\n",
    "    \"ajax\": \"1\",\n",
    "    \"action\": \"list_ex\",\n",
    "    \"begin\": \"0\",\n",
    "    \"count\": \"5\",\n",
    "    \"query\": \"\",\n",
    "    \"fakeid\": \"MjM5MjMzMjY2MA==\",  # presumably the identifier of the target account; confirm\n",
    "    \"type\": \"9\",\n",
    "}\n",
    "\n",
    "\n",
    "\n",
    "content_list = []\n",
    "\n",
    "for i in range(5):\n",
    "    data[\"begin\"] = i*5\n",
    "    print(data[\"begin\"])\n",
    "    time.sleep(3)\n",
    "    # Submit with GET. The timeout keeps a stalled connection from hanging the cell,\n",
    "    # and raise_for_status() turns an HTTP error into an exception.\n",
    "    response = requests.get(url, headers=headers, params=data, timeout=30)\n",
    "    response.raise_for_status()\n",
    "    content_json = response.json()\n",
    "    # Each response holds one page of results. When \"app_msg_list\" is absent (presumably\n",
    "    # because the session was rejected or rate-limited; confirm), stop and show the\n",
    "    # server reply instead of failing with a bare KeyError.\n",
    "    if \"app_msg_list\" not in content_json:\n",
    "        raise RuntimeError(\"unexpected response for begin={}: {}\".format(data[\"begin\"], content_json))\n",
    "    # Keep the title, URL and creation time of every article on the page.\n",
    "    content_list.extend(\n",
    "        [item[\"title\"], item[\"link\"], item[\"create_time\"]]\n",
    "        for item in content_json[\"app_msg_list\"]\n",
    "    )\n",
    "\n",
    "\n",
    "# Write the collected rows to the Excel file named by the output template.\n",
    "name=['title','link','create_time']\n",
    "test=pd.DataFrame(columns=name,data=content_list)\n",
    "with pd.ExcelWriter(fn[\"output\"][\"公众号_xlsx\"].format(公众号=\"吃喝玩乐IN广州_requests\")) as writer:\n",
    "    test.to_excel(writer)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# 采集公众号（selenium）"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [],
   "source": [
    "import pandas as pd\n",
    "import numpy as np\n",
    "from lxml.html import fromstring\n",
    "import time\n",
    "from random import random\n",
    "\n",
    "# when selenium main_content is used\n",
    "# Parses an HTML document from a string constant. Returns the root node.\n",
    "# root = fromstring(df.loc[1,\"html_snippets\"]) "
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 使用Selenium\n",
    "* 要更改 opts.binary_location 至自己本地的Chrome浏览器，建议portable\n",
    "* Chrome浏览器 和 chromedriver.exe要同版本号到小数后一位\n",
    "* 要确保可以 开启浏览器机器人\n",
    "* 要确保浏览器机器人 可以打开网页 driver.get(\"https://mp.weixin.qq.com\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {
    "scrolled": true
   },
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "<ipython-input-10-ce3635f7b200>:19: DeprecationWarning: use options instead of chrome_options\n",
      "  driver = webdriver.Chrome( chrome_options = opts) #desired_capabilities=caps,\n"
     ]
    }
   ],
   "source": [
    "from selenium import webdriver\n",
    "from selenium.webdriver.common.by import By\n",
    "from selenium.webdriver.common.desired_capabilities import DesiredCapabilities\n",
    "\n",
    "#caps=dict()\n",
    "#caps[\"pageLoadStrategy\"] = \"none\"   # Do not wait for full page load\n",
    "\n",
    "# Chrome start-up options for the automated browser.\n",
    "opts = webdriver.ChromeOptions()\n",
    "opts.add_argument('--disable-dev-shm-usage')\n",
    "opts.add_argument('--no-sandbox')  # works around the DevToolsActivePort-file-does-not-exist error\n",
    "opts.add_argument('--window-size=1920,3000')  # window size; Chrome expects WIDTH,HEIGHT separated by a comma\n",
    "opts.add_argument('--disable-gpu')  # the Google documentation mentions this flag as a bug workaround\n",
    "opts.add_argument('--hide-scrollbars')  # hide scrollbars, for some special pages\n",
    "#opts.add_argument('blink-settings=imagesEnabled=false')  # do not load images (faster)\n",
    "#opts.add_argument('--headless')  # no visible window; needed on Linux systems without a display\n",
    "\n",
    "# Path of the local Chrome executable; change it to match your machine.\n",
    "opts.binary_location = r\"C:\\Program Files (x86)\\Google\\Chrome\\Application\\chrome.exe\"\n",
    "\n",
    "# `options=` replaces the deprecated `chrome_options=` keyword.\n",
    "driver = webdriver.Chrome(options=opts)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Open the mp.weixin.qq.com start page in the automated browser; the following cells log in from here.\n",
    "driver.get(\"https://mp.weixin.qq.com\")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 填表登入"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "selenium 的定位方法（注意：find_element_by_* 系列在 Selenium 4.3 起已被移除，新版本请改用 find_element(By.XPATH, ...) 等写法）\n",
    "* find_element_by_id &ensp;&ensp;&ensp;  根据标签id定位\n",
    "* find_element_by_name   &ensp;&ensp;&ensp; 根据标签的name定位\n",
    "* find_element_by_xpath  &ensp;&ensp;&ensp; 根据xpath定位\n",
    "* find_element_by_link_text  &ensp;&ensp;&ensp; 通过文字链接来定位元素\n",
    "* find_element_by_partial_link_text  &ensp;&ensp;&ensp;  通过部分文字链接来定位元素\n",
    "* find_element_by_tag_name  &ensp;&ensp;&ensp;  根据标签的名字定位\n",
    "* find_element_by_class_name  &ensp;&ensp;&ensp; 通过class name 定位\n",
    "* find_element_by_css_selector  &ensp;&ensp;&ensp;  根据CSS选择器来定位"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {},
   "outputs": [],
   "source": [
    "import os\n",
    "from getpass import getpass\n",
    "\n",
    "# SECURITY: credentials must not be stored in the notebook. They are taken from\n",
    "# environment variables, falling back to an interactive prompt (getpass does not echo).\n",
    "payload = {\n",
    "    \"account\": os.environ.get(\"WECHAT_MP_ACCOUNT\") or input(\"account: \"),\n",
    "    \"password\": os.environ.get(\"WECHAT_MP_PASSWORD\") or getpass(\"password: \"),\n",
    "}\n",
    "# Click the link inside the scan-login container; presumably this switches the page to\n",
    "# account/password login (confirm against the live page).\n",
    "driver.find_element(By.XPATH, '//div[@class=\"login__type__container login__type__container__scan\"]/a').click()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "WebDriver 常用方法：\n",
    "* clear()清除文本\n",
    "* send_keys(values)模拟按键输入\n",
    "* click()模拟点击\n",
    "* submit()模拟提交"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Fill in the login form: locate each input once, clear it, then type the stored value.\n",
    "for field in (\"account\", \"password\"):\n",
    "    box = driver.find_element(By.XPATH, '//form[@class=\"login_form\"]//input[@name=\"{}\"]'.format(field))\n",
    "    box.clear()\n",
    "    box.send_keys(payload[field])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Submit the login form by clicking the link inside the login button panel.\n",
    "driver.find_element(By.XPATH, '//div[@class=\"login_btn_panel\"]/a').click()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 点选单"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "其他常用方法\n",
    "* size：返回元素的尺寸\n",
    "* text：获取元素的文本\n",
    "* get_attribute：获取属性值  &ensp;&ensp;&ensp; get_attribute('innerHTML')获取元素内的全部HTML\n",
    "* is_displayed()：返回该元素是否对用户可见"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "'展开'"
      ]
     },
     "execution_count": 15,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Click the link with id \"m_open\" and display its inner HTML.\n",
    "element = driver.find_element(By.XPATH, '//a[@id=\"m_open\"]')\n",
    "element.click()\n",
    "main_content = element.get_attribute('innerHTML')\n",
    "main_content"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Scroll the page to the bottom.\n",
    "driver.execute_script(\"window.scrollTo(0,document.body.scrollHeight)\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "'https://mp.weixin.qq.com/cgi-bin/appmsg?begin=0&count=10&t=media/appmsg_list&type=10&action=list&token=672223852&lang=zh_CN'"
      ]
     },
     "execution_count": 17,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Read the link target of the menu entry whose title contains \"素材管理\".\n",
    "element = driver.find_element(By.XPATH, '//li[@title[contains(.,\"素材管理\")]]/a')\n",
    "url_素材管理 = element.get_attribute(\"href\")\n",
    "url_素材管理"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 18,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Navigate to the URL taken from the 素材管理 menu entry.\n",
    "driver.get(url_素材管理)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 新建图文消息"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 19,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Click the element whose text contains \"新建图文消息\".\n",
    "element = driver.find_element(By.XPATH, '//*[text()[contains(.,\"新建图文消息\")]]')\n",
    "main_content = element.get_attribute('innerHTML')\n",
    "element.click()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 20,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "['CDwindow-69770DC078B3A9FB3CAAEB230211F0F4', 'CDwindow-021D3D4CD43400ED3FAC532EAFD59685']\n"
     ]
    }
   ],
   "source": [
    "# List the handles of all windows currently open in the automated browser.\n",
    "print (driver.window_handles)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 21,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Creating the new message opened another window, so switch the driver to the last window handle.\n",
    "driver.switch_to.window(driver.window_handles[-1])"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 超链接"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 22,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "                超链接              \n"
     ]
    }
   ],
   "source": [
    "# Click the element whose text contains \"超链接\", printing its inner HTML first.\n",
    "element = driver.find_element(By.XPATH, '//*[text()[contains(.,\"超链接\")]]')\n",
    "main_content = element.get_attribute('innerHTML')\n",
    "print(main_content)\n",
    "element.click()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 23,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "选择其他公众号\n"
     ]
    }
   ],
   "source": [
    "# Click the element whose text contains \"选择其他公众号\", printing its inner HTML first.\n",
    "element = driver.find_element(By.XPATH, '//*[text()[contains(.,\"选择其他公众号\")]]')\n",
    "main_content = element.get_attribute('innerHTML')\n",
    "print(main_content)\n",
    "element.click()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 24,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Type the account name into the account search box (located once, cleared first).\n",
    "search_box = driver.find_element(By.XPATH, '//form//div[@class=\"inner_link_account_area\"]//input[@class=\"weui-desktop-form__input\"]')\n",
    "search_box.clear()\n",
    "search_box.send_keys(公众号)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 25,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "<div class=\"weui-desktop-icon weui-desktop-icon__inputSearch weui-desktop-icon__small\"><!----> <!----> <!----> <!----> <!----> <!----> <!----> <!----> <!----> <!----> <!----> <!----> <!----> <!----> <!----> <!----> <!----> <!----> <!----> <!----> <!----> <!----> <!----> <!----> <!----> <!----> <!----> <!----> <!----> <!----> <!----> <!----> <!----> <!----> <!----> <!----> <!----> <!----> <!----> <!----> <!----> <!----> <!----> <!----> <!----> <!----> <!----> <!----> <!----> <!----> <!----> <!----> <!----> <!----> <svg width=\"16\" height=\"16\" viewBox=\"0 0 16 16\" xmlns=\"http://www.w3.org/2000/svg\"><path d=\"M11.33 10.007l4.273 4.273a.502.502 0 0 1 .005.709l-.585.584a.499.499 0 0 1-.709-.004L10.046 11.3a6.278 6.278 0 1 1 1.284-1.294zm.012-3.729a5.063 5.063 0 1 0-10.127 0 5.063 5.063 0 0 0 10.127 0z\"></path></svg> <!----> <!----> <!----> <!----></div>\n"
     ]
    }
   ],
   "source": [
    "# Click the magnifier (search) button, printing its inner HTML first.\n",
    "element = driver.find_element(By.XPATH, '//button[@class=\"weui-desktop-icon-btn weui-desktop-search__btn\"]')\n",
    "main_content = element.get_attribute('innerHTML')\n",
    "print(main_content)\n",
    "element.click()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 26,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "<li class=\"inner_link_account_item\"><div class=\"weui-desktop-vm_primary\"><img src=\"http://mmbiz.qpic.cn/mmbiz_png/fneuMdl9tYL3JkhQUXhLpkZf0FicZAAcCvdajchNEGRvoZCSxGyD1GEEiaB1ppCAACI1GqZHDDiaxyRxkIfvC6Q3w/0?wx_fmt=png\" class=\"inner_link_account_avatar\"> <strong class=\"inner_link_account_nickname\">吃喝玩乐IN广州</strong> <i class=\"inner_link_account_wechat\">微信号：gzlifes</i></div> <div class=\"weui-desktop-vm_default inner_link_account_type\">订阅号</div></li><li class=\"inner_link_account_item\"><div class=\"weui-desktop-vm_primary\"><img src=\"http://mmbiz.qpic.cn/mmbiz_png/CbiahvU88bqrF0bw6qMxmFCHuZfaIQnowiaBfZVCdIHlWicLR1NRs2eOYrNdyHC3tMJLzbXw4KhSzj4FsS3XIptLQ/0?wx_fmt=png\" class=\"inner_link_account_avatar\"> <strong class=\"inner_link_account_nickname\">广州吃喝玩乐</strong> <i class=\"inner_link_account_wechat\">微信号：gz0020</i></div> <div class=\"weui-desktop-vm_default inner_link_account_type\">订阅号</div></li><li class=\"inner_link_account_item\"><div class=\"weui-desktop-vm_primary\"><img src=\"http://mmbiz.qpic.cn/mmbiz_png/uAB6jNicic81Izudvsga0MiaicEXnFhxx1UKzPqpGlxibvT443lq9qxEEHBia9Xol0kJEUSdDjjQ7ajk7y0SSBV4ka3w/0?wx_fmt=png\" class=\"inner_link_account_avatar\"> <strong class=\"inner_link_account_nickname\">吃喝玩乐IN广州</strong> <i class=\"inner_link_account_wechat\">微信号：gzlifes11</i></div> <div class=\"weui-desktop-vm_default inner_link_account_type\">订阅号</div></li><li class=\"inner_link_account_item\"><div class=\"weui-desktop-vm_primary\"><img src=\"http://mmbiz.qpic.cn/mmbiz_png/Vich0ibbIQ7le3mbfz9BRLZsZkhksIWm64llNSkcd2btkiclPMtvMmvb0fqAAQBpdE3XibHTpKCM1DJuOkic8YibibJ2g/0?wx_fmt=png\" class=\"inner_link_account_avatar\"> <strong class=\"inner_link_account_nickname\">吃喝玩乐IN广州</strong> <i class=\"inner_link_account_wechat\">微信号：chwlinfz</i></div> <div class=\"weui-desktop-vm_default inner_link_account_type\">订阅号</div></li><li class=\"inner_link_account_item\"><div class=\"weui-desktop-vm_primary\"><img 
src=\"http://mmbiz.qpic.cn/mmbiz_png/dtPGfkhUfVcpqzgUjp5VXgK99Hlaic5LibhOgs5RyTakSXufESZsqfw7rpZemxerBqrMO1rWrYPjcoj7k40LbyYw/0?wx_fmt=png\" class=\"inner_link_account_avatar\"> <strong class=\"inner_link_account_nickname\">广州吃喝玩乐IN白云</strong> <i class=\"inner_link_account_wechat\">微信号：GZbaiyun33</i></div> <div class=\"weui-desktop-vm_default inner_link_account_type\">订阅号</div></li>\n"
     ]
    }
   ],
   "source": [
    "# Capture the inner HTML of the account result list so it can be parsed offline.\n",
    "element = driver.find_element(By.XPATH, '//ul[@class=\"inner_link_account_list\"]')\n",
    "main_content = element.get_attribute('innerHTML')\n",
    "print(main_content)\n",
    "公众号SERP = main_content"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 27,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Parse the captured HTML fragment with lxml and keep the resulting root element.\n",
    "root = fromstring(公众号SERP) "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 28,
   "metadata": {},
   "outputs": [],
   "source": [
    "# One <li> per account in the search results.\n",
    "主 = root.xpath('//li[@class=\"inner_link_account_item\"]')\n",
    "\n",
    "# Build one record per account: display name, WeChat id text and avatar URL.\n",
    "account_list = [\n",
    "    {\n",
    "        \"nickname\": item.xpath('./div/strong[@class=\"inner_link_account_nickname\"]')[0].text,\n",
    "        \"wechat\": item.xpath('./div/i[@class=\"inner_link_account_wechat\"]')[0].text,\n",
    "        \"img\": item.xpath('./div/img/@src')[0],\n",
    "    }\n",
    "    for item in 主\n",
    "]\n",
    "\n",
    "df_account = pd.DataFrame(account_list)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 29,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>nickname</th>\n",
       "      <th>wechat</th>\n",
       "      <th>img</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>吃喝玩乐IN广州</td>\n",
       "      <td>微信号：gzlifes</td>\n",
       "      <td>http://mmbiz.qpic.cn/mmbiz_png/fneuMdl9tYL3Jkh...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>广州吃喝玩乐</td>\n",
       "      <td>微信号：gz0020</td>\n",
       "      <td>http://mmbiz.qpic.cn/mmbiz_png/CbiahvU88bqrF0b...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>吃喝玩乐IN广州</td>\n",
       "      <td>微信号：gzlifes11</td>\n",
       "      <td>http://mmbiz.qpic.cn/mmbiz_png/uAB6jNicic81Izu...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>吃喝玩乐IN广州</td>\n",
       "      <td>微信号：chwlinfz</td>\n",
       "      <td>http://mmbiz.qpic.cn/mmbiz_png/Vich0ibbIQ7le3m...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>广州吃喝玩乐IN白云</td>\n",
       "      <td>微信号：GZbaiyun33</td>\n",
       "      <td>http://mmbiz.qpic.cn/mmbiz_png/dtPGfkhUfVcpqzg...</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "     nickname          wechat  \\\n",
       "0    吃喝玩乐IN广州     微信号：gzlifes   \n",
       "1      广州吃喝玩乐      微信号：gz0020   \n",
       "2    吃喝玩乐IN广州   微信号：gzlifes11   \n",
       "3    吃喝玩乐IN广州    微信号：chwlinfz   \n",
       "4  广州吃喝玩乐IN白云  微信号：GZbaiyun33   \n",
       "\n",
       "                                                 img  \n",
       "0  http://mmbiz.qpic.cn/mmbiz_png/fneuMdl9tYL3Jkh...  \n",
       "1  http://mmbiz.qpic.cn/mmbiz_png/CbiahvU88bqrF0b...  \n",
       "2  http://mmbiz.qpic.cn/mmbiz_png/uAB6jNicic81Izu...  \n",
       "3  http://mmbiz.qpic.cn/mmbiz_png/Vich0ibbIQ7le3m...  \n",
       "4  http://mmbiz.qpic.cn/mmbiz_png/dtPGfkhUfVcpqzg...  "
      ]
     },
     "execution_count": 29,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Display the table of accounts parsed from the search results.\n",
    "df_account"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 30,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "<div class=\"weui-desktop-vm_primary\"><img src=\"http://mmbiz.qpic.cn/mmbiz_png/fneuMdl9tYL3JkhQUXhLpkZf0FicZAAcCvdajchNEGRvoZCSxGyD1GEEiaB1ppCAACI1GqZHDDiaxyRxkIfvC6Q3w/0?wx_fmt=png\" class=\"inner_link_account_avatar\"> <strong class=\"inner_link_account_nickname\">吃喝玩乐IN广州</strong> <i class=\"inner_link_account_wechat\">微信号：gzlifes</i></div> <div class=\"weui-desktop-vm_default inner_link_account_type\">订阅号</div>\n"
     ]
    }
   ],
   "source": [
    "# Select the first account in the result list (find_element returns the first match).\n",
    "element = driver.find_element(By.XPATH, '//ul[@class=\"inner_link_account_list\"]/li')\n",
    "main_content = element.get_attribute('innerHTML')\n",
    "print(main_content)\n",
    "element.click()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 31,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "'\\n跳转_input = driver.find_element_by_xpath(\\'//span[@class=\"weui-desktop-pagination__form\"]/input\\')\\n跳转_a = driver.find_element_by_xpath(\\'//span[@class=\"weui-desktop-pagination__form\"]/a\\')\\n跳转_input.clear()\\n跳转_input.send_keys(2)\\n跳转_a.click()\\n'"
      ]
     },
     "execution_count": 31,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
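    "# Page-jump test (disabled): the code below is wrapped in a string literal, so it is displayed but never executed.\n",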
    "'''\n",
    "跳转_input = driver.find_element_by_xpath('//span[@class=\"weui-desktop-pagination__form\"]/input')\n",
    "跳转_a = driver.find_element_by_xpath('//span[@class=\"weui-desktop-pagination__form\"]/a')\n",
    "跳转_input.clear()\n",
    "跳转_input.send_keys(2)\n",
    "跳转_a.click()\n",
    "'''"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 32,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[1, 485]\n",
      "False\n"
     ]
    }
   ],
   "source": [
    "# Upper bound for page jumps: read the numbers shown in the pagination labels\n",
    "# (presumably the current page first and the last page at the end; confirm on the page).\n",
    "l_e = driver.find_elements(By.XPATH, '//label[@class=\"weui-desktop-pagination__num\"]')\n",
    "l_e_int  = [int(x.text) for x in l_e]\n",
    "print (l_e_int)\n",
    "# True when the first and last label show the same number.\n",
    "print (l_e_int[0]==l_e_int[-1])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 33,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127, 128, 129, 130, 131, 132, 133, 134, 135, 136, 137, 138, 139, 140, 141, 142, 143, 144, 145, 146, 147, 148, 149, 150, 151, 152, 153, 154, 155, 156, 157, 158, 159, 160, 161, 162, 163, 164, 165, 166, 167, 168, 169, 170, 171, 172, 173, 174, 175, 176, 177, 178, 179, 180, 181, 182, 183, 184, 185, 186, 187, 188, 189, 190, 191, 192, 193, 194, 195, 196, 197, 198, 199, 200, 201, 202, 203, 204, 205, 206, 207, 208, 209, 210, 211, 212, 213, 214, 215, 216, 217, 218, 219, 220, 221, 222, 223, 224, 225, 226, 227, 228, 229, 230, 231, 232, 233, 234, 235, 236, 237, 238, 239, 240, 241, 242, 243, 244, 245, 246, 247, 248, 249, 250, 251, 252, 253, 254, 255, 256, 257, 258, 259, 260, 261, 262, 263, 264, 265, 266, 267, 268, 269, 270, 271, 272, 273, 274, 275, 276, 277, 278, 279, 280, 281, 282, 283, 284, 285, 286, 287, 288, 289, 290, 291, 292, 293, 294, 295, 296, 297, 298, 299, 300, 301, 302, 303, 304, 305, 306, 307, 308, 309, 310, 311, 312, 313, 314, 315, 316, 317, 318, 319, 320, 321, 322, 323, 324, 325, 326, 327, 328, 329, 330, 331, 332, 333, 334, 335, 336, 337, 338, 339, 340, 341, 342, 343, 344, 345, 346, 347, 348, 349, 350, 351, 352, 353, 354, 355, 356, 357, 358, 359, 360, 361, 362, 363, 364, 365, 366, 367, 368, 369, 370, 371, 372, 373, 374, 375, 376, 377, 378, 379, 380, 381, 382, 383, 384, 385, 386, 387, 388, 389, 390, 391, 392, 393, 394, 395, 396, 397, 398, 399, 400, 401, 402, 403, 404, 405, 406, 407, 408, 409, 410, 411, 412, 413, 414, 415, 416, 417, 418, 419, 420, 
421, 422, 423, 424, 425, 426, 427, 428, 429, 430, 431, 432, 433, 434, 435, 436, 437, 438, 439, 440, 441, 442, 443, 444, 445, 446, 447, 448, 449, 450, 451, 452, 453, 454, 455, 456, 457, 458, 459, 460, 461, 462, 463, 464, 465, 466, 467, 468, 469, 470, 471, 472, 473, 474, 475, 476, 477, 478, 479, 480, 481, 482, 483, 484, 485]\n"
     ]
    }
   ],
   "source": [
    "# Pages to visit: 1 through the last number read from the pagination labels.\n",
    "pages = list(range(1, l_e_int[-1] + 1))\n",
    "# Print a short summary rather than every page number, to keep the cell output small.\n",
    "print(\"pages 1..{} ({} in total)\".format(l_e_int[-1], len(pages)))"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 循环/遍历"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 34,
   "metadata": {},
   "outputs": [],
   "source": [
    "# global varialbes \n",
    "html_raw = dict()\n",
    "main_content =\"\"\n",
    "element = None"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 35,
   "metadata": {},
   "outputs": [],
   "source": [
    "def process_pages (pages):\n",
    "    for p in pages:\n",
    "        print (p,end='\\t')\n",
    "\n",
    "        跳转_input = driver.find_element_by_xpath('//span[@class=\"weui-desktop-pagination__form\"]/input')\n",
    "        跳转_a = driver.find_element_by_xpath('//span[@class=\"weui-desktop-pagination__form\"]/a')\n",
    "        跳转_input.clear()\n",
    "        跳转_input.send_keys(p)\n",
    "        跳转_a.click()\n",
    "\n",
    "        time.sleep(1)\n",
    "\n",
    "        element = driver.find_element_by_xpath('//div[@class=\"inner_link_article_list\"]')\n",
    "        main_content = element.get_attribute('innerHTML')\n",
    "        #print(main_content)\n",
    "        html_raw[p] = main_content"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 36,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "1\t2\t3\t4\t5\t6\t7\t8\t9\t10\t11\t12\t13\t14\t15\t16\t17\t18\t19\t20\t21\t22\t23\t24\t25\t26\t27\t28\t29\t30\t31\t32\t33\t34\t35\t36\t37\t38\t39\t40\t41\t42\t43\t44\t45\t46\t47\t48\t49\t50\t51\t52\t53\t54\t55\t56\t57\t58\t59\t60\t61\t62\t63\t64\t65\t66\t67\t68\t69\t70\t71\t72\t73\t74\t75\t76\t77\t78\t79\t80\t81\t82\t83\t84\t85\t86\t87\t88\t89\t90\t91\t92\t93\t94\t95\t96\t97\t98\t99\t100\t101\t102\t103\t104\t105\t106\t107\t108\t109\t110\t111\t112\t113\t114\t115\t116\t117\t118\t119\t120\t121\t122\t123\t124\t125\t126\t127\t128\t129\t130\t131\t132\t133\t134\t135\t136\t137\t138\t139\t140\t141\t142\t143\t144\t145\t146\t147\t148\t149\t150\t151\t152\t153\t154\t155\t156\t157\t158\t159\t160\t161\t162\t163\t164\t165\t166\t167\t168\t169\t170\t171\t172\t173\t174\t175\t176\t177\t178\t179\t180\t181\t182\t183\t184\t185\t186\t187\t188\t189\t190\t191\t192\t193\t194\t195\t196\t197\t198\t199\t200\t201\t202\t203\t204\t205\t206\t207\t208\t209\t210\t211\t212\t213\t214\t215\t216\t217\t218\t219\t220\t221\t222\t223\t224\t225\t226\t227\t228\t229\t230\t231\t232\t233\t234\t235\t236\t237\t238\t239\t240\t241\t242\t243\t244\t245\t246\t247\t248\t249\t250\t251\t252\t253\t254\t255\t256\t257\t258\t259\t260\t261\t262\t263\t264\t265\t266\t267\t268\t269\t270\t271\t272\t273\t274\t275\t276\t277\t278\t279\t280\t281\t282\t283\t284\t285\t286\t287\t288\t289\t290\t291\t292\t293\t294\t295\t296\t297\t298\t299\t300\t301\t302\t303\t304\t305\t306\t307\t308\t309\t310\t311\t312\t313\t314\t315\t316\t317\t318\t319\t320\t321\t322\t323\t324\t325\t326\t327\t328\t329\t330\t331\t332\t333\t334\t335\t336\t337\t338\t339\t340\t341\t342\t343\t344\t345\t346\t347\t348\t349\t350\t351\t352\t353\t354\t355\t356\t357\t358\t359\t360\t361\t362\t363\t364\t365\t366\t367\t368\t369\t370\t371\t372\t373\t374\t375\t376\t377\t378\t379\t380\t381\t382\t383\t384\t385\t386\t387\t388\t389\t390\t391\t392\t393\t394\t395\t396\t397\t398\t399\t400\t401\t402\t403\t404\t405\t406\t407\t408\t409\t410\t411\t412\t413\t414\t415\t416\t417\t418\t419\t420\t4
21\t422\t423\t424\t425\t426\t427\t428\t429\t430\t431\t432\t433\t434\t435\t436\t437\t438\t439\t440\t441\t442\t443\t444\t445\t446\t447\t448\t449\t450\t451\t452\t453\t454\t455\t456\t457\t458\t459\t460\t461\t462\t463\t464\t465\t466\t467\t468\t469\t470\t471\t472\t473\t474\t475\t476\t477\t478\t479\t480\t481\t482\t483\t484\t485\t"
     ]
    }
   ],
   "source": [
    "# Capture the HTML of every page; the function sleeps after each page jump, so this takes a while.\n",
    "process_pages(pages)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 37,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>html_snippets</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>&lt;div&gt;&lt;label class=\"inner_link_article_item\"&gt;&lt;s...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>&lt;div&gt;&lt;label class=\"inner_link_article_item\"&gt;&lt;s...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>&lt;div&gt;&lt;label class=\"inner_link_article_item\"&gt;&lt;s...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>&lt;div&gt;&lt;label class=\"inner_link_article_item\"&gt;&lt;s...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>5</th>\n",
       "      <td>&lt;div&gt;&lt;label class=\"inner_link_article_item\"&gt;&lt;s...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>...</th>\n",
       "      <td>...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>481</th>\n",
       "      <td>&lt;div&gt;&lt;label class=\"inner_link_article_item\"&gt;&lt;s...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>482</th>\n",
       "      <td>&lt;div&gt;&lt;label class=\"inner_link_article_item\"&gt;&lt;s...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>483</th>\n",
       "      <td>&lt;div&gt;&lt;label class=\"inner_link_article_item\"&gt;&lt;s...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>484</th>\n",
       "      <td>&lt;div&gt;&lt;label class=\"inner_link_article_item\"&gt;&lt;s...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>485</th>\n",
       "      <td>&lt;div&gt;&lt;label class=\"inner_link_article_item\"&gt;&lt;s...</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>485 rows × 1 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "                                         html_snippets\n",
       "1    <div><label class=\"inner_link_article_item\"><s...\n",
       "2    <div><label class=\"inner_link_article_item\"><s...\n",
       "3    <div><label class=\"inner_link_article_item\"><s...\n",
       "4    <div><label class=\"inner_link_article_item\"><s...\n",
       "5    <div><label class=\"inner_link_article_item\"><s...\n",
       "..                                                 ...\n",
       "481  <div><label class=\"inner_link_article_item\"><s...\n",
       "482  <div><label class=\"inner_link_article_item\"><s...\n",
       "483  <div><label class=\"inner_link_article_item\"><s...\n",
       "484  <div><label class=\"inner_link_article_item\"><s...\n",
       "485  <div><label class=\"inner_link_article_item\"><s...\n",
       "\n",
       "[485 rows x 1 columns]"
      ]
     },
     "execution_count": 37,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df = pd.DataFrame([html_raw]).T\n",
    "df.columns = [\"html_snippets\"]\n",
    "df"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 38,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Stored 'html_raw' (dict)\n"
     ]
    }
   ],
   "source": [
    "%store html_raw\n",
    "import pickle \n",
    "filehandler = open(\"html_raw\", 'wb') \n",
    "pickle.dump(html_raw, filehandler)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 39,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "11\n"
     ]
    },
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>html_snippets</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>11</th>\n",
       "      <td>&lt;div&gt;&lt;label class=\"inner_link_article_item\"&gt;&lt;s...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>12</th>\n",
       "      <td>&lt;div&gt;&lt;label class=\"inner_link_article_item\"&gt;&lt;s...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>13</th>\n",
       "      <td>&lt;div&gt;&lt;label class=\"inner_link_article_item\"&gt;&lt;s...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>14</th>\n",
       "      <td>&lt;div&gt;&lt;label class=\"inner_link_article_item\"&gt;&lt;s...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>15</th>\n",
       "      <td>&lt;div&gt;&lt;label class=\"inner_link_article_item\"&gt;&lt;s...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>...</th>\n",
       "      <td>...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>481</th>\n",
       "      <td>&lt;div&gt;&lt;label class=\"inner_link_article_item\"&gt;&lt;s...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>482</th>\n",
       "      <td>&lt;div&gt;&lt;label class=\"inner_link_article_item\"&gt;&lt;s...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>483</th>\n",
       "      <td>&lt;div&gt;&lt;label class=\"inner_link_article_item\"&gt;&lt;s...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>484</th>\n",
       "      <td>&lt;div&gt;&lt;label class=\"inner_link_article_item\"&gt;&lt;s...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>485</th>\n",
       "      <td>&lt;div&gt;&lt;label class=\"inner_link_article_item\"&gt;&lt;s...</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>474 rows × 1 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "                                         html_snippets\n",
       "11   <div><label class=\"inner_link_article_item\"><s...\n",
       "12   <div><label class=\"inner_link_article_item\"><s...\n",
       "13   <div><label class=\"inner_link_article_item\"><s...\n",
       "14   <div><label class=\"inner_link_article_item\"><s...\n",
       "15   <div><label class=\"inner_link_article_item\"><s...\n",
       "..                                                 ...\n",
       "481  <div><label class=\"inner_link_article_item\"><s...\n",
       "482  <div><label class=\"inner_link_article_item\"><s...\n",
       "483  <div><label class=\"inner_link_article_item\"><s...\n",
       "484  <div><label class=\"inner_link_article_item\"><s...\n",
       "485  <div><label class=\"inner_link_article_item\"><s...\n",
       "\n",
       "[474 rows x 1 columns]"
      ]
     },
     "execution_count": 39,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df_out = df[~df.duplicated()]\n",
    "print (len(df_out))\n",
    "df[df.duplicated()]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 40,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127, 128, 129, 130, 131, 132, 133, 134, 135, 136, 137, 138, 139, 140, 141, 142, 143, 144, 145, 146, 147, 148, 149, 150, 151, 152, 153, 154, 155, 156, 157, 158, 159, 160, 161, 162, 163, 164, 165, 166, 167, 168, 169, 170, 171, 172, 173, 174, 175, 176, 177, 178, 179, 180, 181, 182, 183, 184, 185, 186, 187, 188, 189, 190, 191, 192, 193, 194, 195, 196, 197, 198, 199, 200, 201, 202, 203, 204, 205, 206, 207, 208, 209, 210, 211, 212, 213, 214, 215, 216, 217, 218, 219, 220, 221, 222, 223, 224, 225, 226, 227, 228, 229, 230, 231, 232, 233, 234, 235, 236, 237, 238, 239, 240, 241, 242, 243, 244, 245, 246, 247, 248, 249, 250, 251, 252, 253, 254, 255, 256, 257, 258, 259, 260, 261, 262, 263, 264, 265, 266, 267, 268, 269, 270, 271, 272, 273, 274, 275, 276, 277, 278, 279, 280, 281, 282, 283, 284, 285, 286, 287, 288, 289, 290, 291, 292, 293, 294, 295, 296, 297, 298, 299, 300, 301, 302, 303, 304, 305, 306, 307, 308, 309, 310, 311, 312, 313, 314, 315, 316, 317, 318, 319, 320, 321, 322, 323, 324, 325, 326, 327, 328, 329, 330, 331, 332, 333, 334, 335, 336, 337, 338, 339, 340, 341, 342, 343, 344, 345, 346, 347, 348, 349, 350, 351, 352, 353, 354, 355, 356, 357, 358, 359, 360, 361, 362, 363, 364, 365, 366, 367, 368, 369, 370, 371, 372, 373, 374, 375, 376, 377, 378, 379, 380, 381, 382, 383, 384, 385, 386, 387, 388, 389, 390, 391, 392, 393, 394, 395, 396, 397, 398, 399, 400, 401, 402, 403, 404, 405, 406, 407, 408, 409, 410, 411, 412, 413, 414, 415, 416, 417, 418, 419, 420, 421, 422, 423, 424, 425, 426, 427, 
428, 429, 430, 431, 432, 433, 434, 435, 436, 437, 438, 439, 440, 441, 442, 443, 444, 445, 446, 447, 448, 449, 450, 451, 452, 453, 454, 455, 456, 457, 458, 459, 460, 461, 462, 463, 464, 465, 466, 467, 468, 469, 470, 471, 472, 473, 474, 475, 476, 477, 478, 479, 480, 481, 482, 483, 484, 485]\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "[11,\n",
       " 12,\n",
       " 13,\n",
       " 14,\n",
       " 15,\n",
       " 16,\n",
       " 17,\n",
       " 18,\n",
       " 19,\n",
       " 20,\n",
       " 21,\n",
       " 22,\n",
       " 23,\n",
       " 24,\n",
       " 25,\n",
       " 26,\n",
       " 27,\n",
       " 28,\n",
       " 29,\n",
       " 30,\n",
       " 31,\n",
       " 32,\n",
       " 33,\n",
       " 34,\n",
       " 35,\n",
       " 36,\n",
       " 37,\n",
       " 38,\n",
       " 39,\n",
       " 40,\n",
       " 41,\n",
       " 42,\n",
       " 43,\n",
       " 44,\n",
       " 45,\n",
       " 46,\n",
       " 47,\n",
       " 48,\n",
       " 49,\n",
       " 50,\n",
       " 51,\n",
       " 52,\n",
       " 53,\n",
       " 54,\n",
       " 55,\n",
       " 56,\n",
       " 57,\n",
       " 58,\n",
       " 59,\n",
       " 60,\n",
       " 61,\n",
       " 62,\n",
       " 63,\n",
       " 64,\n",
       " 65,\n",
       " 66,\n",
       " 67,\n",
       " 68,\n",
       " 69,\n",
       " 70,\n",
       " 71,\n",
       " 72,\n",
       " 73,\n",
       " 74,\n",
       " 75,\n",
       " 76,\n",
       " 77,\n",
       " 78,\n",
       " 79,\n",
       " 80,\n",
       " 81,\n",
       " 82,\n",
       " 83,\n",
       " 84,\n",
       " 85,\n",
       " 86,\n",
       " 87,\n",
       " 88,\n",
       " 89,\n",
       " 90,\n",
       " 91,\n",
       " 92,\n",
       " 93,\n",
       " 94,\n",
       " 95,\n",
       " 96,\n",
       " 97,\n",
       " 98,\n",
       " 99,\n",
       " 100,\n",
       " 101,\n",
       " 102,\n",
       " 103,\n",
       " 104,\n",
       " 105,\n",
       " 106,\n",
       " 107,\n",
       " 108,\n",
       " 109,\n",
       " 110,\n",
       " 111,\n",
       " 112,\n",
       " 113,\n",
       " 114,\n",
       " 115,\n",
       " 116,\n",
       " 118,\n",
       " 119,\n",
       " 120,\n",
       " 121,\n",
       " 122,\n",
       " 123,\n",
       " 124,\n",
       " 125,\n",
       " 126,\n",
       " 127,\n",
       " 128,\n",
       " 129,\n",
       " 130,\n",
       " 131,\n",
       " 132,\n",
       " 133,\n",
       " 134,\n",
       " 135,\n",
       " 136,\n",
       " 137,\n",
       " 138,\n",
       " 139,\n",
       " 140,\n",
       " 141,\n",
       " 142,\n",
       " 143,\n",
       " 144,\n",
       " 145,\n",
       " 146,\n",
       " 147,\n",
       " 148,\n",
       " 149,\n",
       " 150,\n",
       " 151,\n",
       " 152,\n",
       " 153,\n",
       " 154,\n",
       " 155,\n",
       " 156,\n",
       " 157,\n",
       " 158,\n",
       " 159,\n",
       " 160,\n",
       " 161,\n",
       " 162,\n",
       " 163,\n",
       " 164,\n",
       " 165,\n",
       " 166,\n",
       " 167,\n",
       " 168,\n",
       " 169,\n",
       " 170,\n",
       " 171,\n",
       " 172,\n",
       " 173,\n",
       " 174,\n",
       " 175,\n",
       " 176,\n",
       " 177,\n",
       " 178,\n",
       " 179,\n",
       " 180,\n",
       " 181,\n",
       " 182,\n",
       " 183,\n",
       " 184,\n",
       " 185,\n",
       " 186,\n",
       " 187,\n",
       " 188,\n",
       " 189,\n",
       " 190,\n",
       " 191,\n",
       " 192,\n",
       " 193,\n",
       " 194,\n",
       " 195,\n",
       " 196,\n",
       " 197,\n",
       " 198,\n",
       " 199,\n",
       " 200,\n",
       " 201,\n",
       " 202,\n",
       " 203,\n",
       " 204,\n",
       " 205,\n",
       " 206,\n",
       " 207,\n",
       " 208,\n",
       " 209,\n",
       " 210,\n",
       " 211,\n",
       " 212,\n",
       " 213,\n",
       " 214,\n",
       " 215,\n",
       " 216,\n",
       " 217,\n",
       " 218,\n",
       " 219,\n",
       " 220,\n",
       " 221,\n",
       " 222,\n",
       " 223,\n",
       " 224,\n",
       " 225,\n",
       " 226,\n",
       " 227,\n",
       " 228,\n",
       " 229,\n",
       " 230,\n",
       " 231,\n",
       " 232,\n",
       " 233,\n",
       " 234,\n",
       " 235,\n",
       " 236,\n",
       " 237,\n",
       " 238,\n",
       " 239,\n",
       " 240,\n",
       " 241,\n",
       " 242,\n",
       " 243,\n",
       " 244,\n",
       " 245,\n",
       " 246,\n",
       " 247,\n",
       " 248,\n",
       " 249,\n",
       " 250,\n",
       " 251,\n",
       " 252,\n",
       " 253,\n",
       " 254,\n",
       " 255,\n",
       " 256,\n",
       " 257,\n",
       " 258,\n",
       " 259,\n",
       " 260,\n",
       " 261,\n",
       " 262,\n",
       " 263,\n",
       " 264,\n",
       " 265,\n",
       " 266,\n",
       " 267,\n",
       " 268,\n",
       " 269,\n",
       " 270,\n",
       " 271,\n",
       " 272,\n",
       " 273,\n",
       " 274,\n",
       " 275,\n",
       " 276,\n",
       " 277,\n",
       " 278,\n",
       " 279,\n",
       " 280,\n",
       " 281,\n",
       " 282,\n",
       " 283,\n",
       " 284,\n",
       " 285,\n",
       " 286,\n",
       " 287,\n",
       " 288,\n",
       " 289,\n",
       " 290,\n",
       " 291,\n",
       " 292,\n",
       " 293,\n",
       " 294,\n",
       " 295,\n",
       " 296,\n",
       " 297,\n",
       " 298,\n",
       " 299,\n",
       " 300,\n",
       " 301,\n",
       " 302,\n",
       " 303,\n",
       " 304,\n",
       " 305,\n",
       " 306,\n",
       " 307,\n",
       " 308,\n",
       " 309,\n",
       " 310,\n",
       " 311,\n",
       " 312,\n",
       " 313,\n",
       " 314,\n",
       " 315,\n",
       " 316,\n",
       " 317,\n",
       " 318,\n",
       " 319,\n",
       " 320,\n",
       " 321,\n",
       " 322,\n",
       " 323,\n",
       " 324,\n",
       " 325,\n",
       " 326,\n",
       " 327,\n",
       " 328,\n",
       " 329,\n",
       " 330,\n",
       " 331,\n",
       " 332,\n",
       " 333,\n",
       " 334,\n",
       " 335,\n",
       " 336,\n",
       " 337,\n",
       " 338,\n",
       " 339,\n",
       " 340,\n",
       " 341,\n",
       " 342,\n",
       " 343,\n",
       " 344,\n",
       " 345,\n",
       " 346,\n",
       " 347,\n",
       " 348,\n",
       " 349,\n",
       " 350,\n",
       " 351,\n",
       " 352,\n",
       " 353,\n",
       " 354,\n",
       " 355,\n",
       " 356,\n",
       " 357,\n",
       " 358,\n",
       " 359,\n",
       " 360,\n",
       " 361,\n",
       " 362,\n",
       " 363,\n",
       " 364,\n",
       " 365,\n",
       " 366,\n",
       " 367,\n",
       " 368,\n",
       " 369,\n",
       " 370,\n",
       " 371,\n",
       " 372,\n",
       " 373,\n",
       " 374,\n",
       " 375,\n",
       " 376,\n",
       " 377,\n",
       " 378,\n",
       " 379,\n",
       " 380,\n",
       " 381,\n",
       " 382,\n",
       " 383,\n",
       " 384,\n",
       " 385,\n",
       " 386,\n",
       " 387,\n",
       " 388,\n",
       " 389,\n",
       " 390,\n",
       " 391,\n",
       " 392,\n",
       " 393,\n",
       " 394,\n",
       " 395,\n",
       " 396,\n",
       " 397,\n",
       " 398,\n",
       " 399,\n",
       " 400,\n",
       " 401,\n",
       " 402,\n",
       " 403,\n",
       " 404,\n",
       " 405,\n",
       " 406,\n",
       " 407,\n",
       " 408,\n",
       " 409,\n",
       " 410,\n",
       " 411,\n",
       " 412,\n",
       " 413,\n",
       " 414,\n",
       " 415,\n",
       " 416,\n",
       " 417,\n",
       " 418,\n",
       " 419,\n",
       " 420,\n",
       " 421,\n",
       " 422,\n",
       " 423,\n",
       " 424,\n",
       " 425,\n",
       " 426,\n",
       " 427,\n",
       " 428,\n",
       " 429,\n",
       " 430,\n",
       " 431,\n",
       " 432,\n",
       " 433,\n",
       " 434,\n",
       " 435,\n",
       " 436,\n",
       " 437,\n",
       " 438,\n",
       " 439,\n",
       " 440,\n",
       " 441,\n",
       " 442,\n",
       " 443,\n",
       " 444,\n",
       " 445,\n",
       " 446,\n",
       " 447,\n",
       " 448,\n",
       " 449,\n",
       " 450,\n",
       " 451,\n",
       " 452,\n",
       " 453,\n",
       " 454,\n",
       " 455,\n",
       " 456,\n",
       " 457,\n",
       " 458,\n",
       " 459,\n",
       " 460,\n",
       " 461,\n",
       " 462,\n",
       " 463,\n",
       " 464,\n",
       " 465,\n",
       " 466,\n",
       " 467,\n",
       " 468,\n",
       " 469,\n",
       " 470,\n",
       " 471,\n",
       " 472,\n",
       " 473,\n",
       " 474,\n",
       " 475,\n",
       " 476,\n",
       " 477,\n",
       " 478,\n",
       " 479,\n",
       " 480,\n",
       " 481,\n",
       " 482,\n",
       " 483,\n",
       " 484,\n",
       " 485]"
      ]
     },
     "execution_count": 40,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "try_again = list(df[df.duplicated()].index)\n",
    "print(try_again)\n",
    "try_again = try_again + list (set(pages).difference(set(df.index.values)))\n",
    "try_again"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 暂存档"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 41,
   "metadata": {},
   "outputs": [],
   "source": [
    "filename = fn [\"output\"] [\"公众号_htm_snippets\"] \n",
    "df_out.to_csv(filename.format(公众号=公众号), sep=\"\\t\", encoding=\"utf8\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 42,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "15,17,15,15,15,15,14,16,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,"
     ]
    }
   ],
   "source": [
    "def parse_html_snippets(_snippet_):\n",
    "    root = fromstring(_snippet_) \n",
    "    title = [x.text for x in root.xpath('//div[@class=\"inner_link_article_title\"]')]\n",
    "    create_time = [x.text for x in root.xpath('//div[@class=\"inner_link_article_date\"]')]\n",
    "    link = [x for x in root.xpath('//a/@href')]\n",
    "    _df_ = pd.DataFrame({\"title\":title, \"create_time\": create_time, \"link\":link})\n",
    "    return(_df_)\n",
    "    \n",
    "l_df = []\n",
    "for p in pages:\n",
    "    _df_ = parse_html_snippets(df.loc[p,\"html_snippets\"])\n",
    "    print (len(_df_), end=\",\")\n",
    "    l_df.append(_df_)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 43,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>title</th>\n",
       "      <th>create_time</th>\n",
       "      <th>link</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>2蚊鸡！坐上广州这路“美食公交”线，从东山吃到西关！</td>\n",
       "      <td>2020-05-17</td>\n",
       "      <td>http://mp.weixin.qq.com/s?__biz=MjM5MjMzMjY2MA...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>不走回头路，把广州的森系小店统统逛遍！</td>\n",
       "      <td>2020-05-17</td>\n",
       "      <td>http://mp.weixin.qq.com/s?__biz=MjM5MjMzMjY2MA...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>意难平！陪你度过童年的20种零食，没有陪你一起长大…</td>\n",
       "      <td>2020-05-17</td>\n",
       "      <td>http://mp.weixin.qq.com/s?__biz=MjM5MjMzMjY2MA...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>奈雪的荔枝季来了，霸气荔枝+气泡茶双双登场！</td>\n",
       "      <td>2020-05-16</td>\n",
       "      <td>http://mp.weixin.qq.com/s?__biz=MjM5MjMzMjY2MA...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>就在明天！人气餐饮1元秒杀、1折开吃...517吃货节又来了！</td>\n",
       "      <td>2020-05-16</td>\n",
       "      <td>http://mp.weixin.qq.com/s?__biz=MjM5MjMzMjY2MA...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>5</th>\n",
       "      <td>横扫夏日炎热！这8道开胃菜，怎么吃都不腻！爱了~</td>\n",
       "      <td>2020-05-16</td>\n",
       "      <td>http://mp.weixin.qq.com/s?__biz=MjM5MjMzMjY2MA...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>6</th>\n",
       "      <td>¥99换眼镜！墨镜买一送一！害，近视党又逃不出“眼镜王国”的掌心了！</td>\n",
       "      <td>2020-05-15</td>\n",
       "      <td>http://mp.weixin.qq.com/s?__biz=MjM5MjMzMjY2MA...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>7</th>\n",
       "      <td>真香！贵州来的铁板烙锅，在广州很难找到第二家！</td>\n",
       "      <td>2020-05-15</td>\n",
       "      <td>http://mp.weixin.qq.com/s?__biz=MjM5MjMzMjY2MA...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>8</th>\n",
       "      <td>体育西这家小蓝屋，里面全是“小宝物”，一天只开4小时！</td>\n",
       "      <td>2020-05-15</td>\n",
       "      <td>http://mp.weixin.qq.com/s?__biz=MjM5MjMzMjY2MA...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>9</th>\n",
       "      <td>江博士清仓活动：全场货品69元-169元！</td>\n",
       "      <td>2020-05-14</td>\n",
       "      <td>http://mp.weixin.qq.com/s?__biz=MjM5MjMzMjY2MA...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>10</th>\n",
       "      <td>这些老板很凶的店，来不来随你！</td>\n",
       "      <td>2020-05-14</td>\n",
       "      <td>http://mp.weixin.qq.com/s?__biz=MjM5MjMzMjY2MA...</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "                                 title create_time  \\\n",
       "0           2蚊鸡！坐上广州这路“美食公交”线，从东山吃到西关！  2020-05-17   \n",
       "1                  不走回头路，把广州的森系小店统统逛遍！  2020-05-17   \n",
       "2           意难平！陪你度过童年的20种零食，没有陪你一起长大…  2020-05-17   \n",
       "3               奈雪的荔枝季来了，霸气荔枝+气泡茶双双登场！  2020-05-16   \n",
       "4      就在明天！人气餐饮1元秒杀、1折开吃...517吃货节又来了！  2020-05-16   \n",
       "5             横扫夏日炎热！这8道开胃菜，怎么吃都不腻！爱了~  2020-05-16   \n",
       "6   ¥99换眼镜！墨镜买一送一！害，近视党又逃不出“眼镜王国”的掌心了！  2020-05-15   \n",
       "7              真香！贵州来的铁板烙锅，在广州很难找到第二家！  2020-05-15   \n",
       "8          体育西这家小蓝屋，里面全是“小宝物”，一天只开4小时！  2020-05-15   \n",
       "9                江博士清仓活动：全场货品69元-169元！  2020-05-14   \n",
       "10                     这些老板很凶的店，来不来随你！  2020-05-14   \n",
       "\n",
       "                                                 link  \n",
       "0   http://mp.weixin.qq.com/s?__biz=MjM5MjMzMjY2MA...  \n",
       "1   http://mp.weixin.qq.com/s?__biz=MjM5MjMzMjY2MA...  \n",
       "2   http://mp.weixin.qq.com/s?__biz=MjM5MjMzMjY2MA...  \n",
       "3   http://mp.weixin.qq.com/s?__biz=MjM5MjMzMjY2MA...  \n",
       "4   http://mp.weixin.qq.com/s?__biz=MjM5MjMzMjY2MA...  \n",
       "5   http://mp.weixin.qq.com/s?__biz=MjM5MjMzMjY2MA...  \n",
       "6   http://mp.weixin.qq.com/s?__biz=MjM5MjMzMjY2MA...  \n",
       "7   http://mp.weixin.qq.com/s?__biz=MjM5MjMzMjY2MA...  \n",
       "8   http://mp.weixin.qq.com/s?__biz=MjM5MjMzMjY2MA...  \n",
       "9   http://mp.weixin.qq.com/s?__biz=MjM5MjMzMjY2MA...  \n",
       "10  http://mp.weixin.qq.com/s?__biz=MjM5MjMzMjY2MA...  "
      ]
     },
     "execution_count": 43,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df_url_out = pd.concat(l_df).reset_index(drop=True)\n",
    "df_url_out.loc[0:10]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 44,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>title</th>\n",
       "      <th>create_time</th>\n",
       "      <th>link</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>7272</th>\n",
       "      <td>江南西，熟悉的宝藏小店一家没少，都开好了！</td>\n",
       "      <td>2020-03-28</td>\n",
       "      <td>http://mp.weixin.qq.com/s?__biz=MjM5MjMzMjY2MA...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>7273</th>\n",
       "      <td>3月不减肥，接下来都徒伤悲！这份减脂食谱，你必须拥有！</td>\n",
       "      <td>2020-03-28</td>\n",
       "      <td>http://mp.weixin.qq.com/s?__biz=MjM5MjMzMjY2MA...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>7274</th>\n",
       "      <td>@广州所有白衣英雄，整整6个月的福利等你查收！</td>\n",
       "      <td>2020-03-27</td>\n",
       "      <td>http://mp.weixin.qq.com/s?__biz=MjM5MjMzMjY2MA...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>7275</th>\n",
       "      <td>想吃炸鸡的第89天，我已经忍不住了！</td>\n",
       "      <td>2020-03-27</td>\n",
       "      <td>http://mp.weixin.qq.com/s?__biz=MjM5MjMzMjY2MA...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>7276</th>\n",
       "      <td>我劝你“不要和广东人吵架”！因为真的会越骂越饿哈哈哈！</td>\n",
       "      <td>2020-03-27</td>\n",
       "      <td>http://mp.weixin.qq.com/s?__biz=MjM5MjMzMjY2MA...</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "                            title create_time  \\\n",
       "7272        江南西，熟悉的宝藏小店一家没少，都开好了！  2020-03-28   \n",
       "7273  3月不减肥，接下来都徒伤悲！这份减脂食谱，你必须拥有！  2020-03-28   \n",
       "7274      @广州所有白衣英雄，整整6个月的福利等你查收！  2020-03-27   \n",
       "7275           想吃炸鸡的第89天，我已经忍不住了！  2020-03-27   \n",
       "7276  我劝你“不要和广东人吵架”！因为真的会越骂越饿哈哈哈！  2020-03-27   \n",
       "\n",
       "                                                   link  \n",
       "7272  http://mp.weixin.qq.com/s?__biz=MjM5MjMzMjY2MA...  \n",
       "7273  http://mp.weixin.qq.com/s?__biz=MjM5MjMzMjY2MA...  \n",
       "7274  http://mp.weixin.qq.com/s?__biz=MjM5MjMzMjY2MA...  \n",
       "7275  http://mp.weixin.qq.com/s?__biz=MjM5MjMzMjY2MA...  \n",
       "7276  http://mp.weixin.qq.com/s?__biz=MjM5MjMzMjY2MA...  "
      ]
     },
     "execution_count": 44,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df_url_out.tail(5)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 45,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>title</th>\n",
       "      <th>create_time</th>\n",
       "      <th>link</th>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>value</th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>不走回头路，把广州的森系小店统统逛遍！</td>\n",
       "      <td>2020-05-17</td>\n",
       "      <td>http://mp.weixin.qq.com/s?__biz=MjM5MjMzMjY2MA...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>意难平！陪你度过童年的20种零食，没有陪你一起长大…</td>\n",
       "      <td>2020-05-17</td>\n",
       "      <td>http://mp.weixin.qq.com/s?__biz=MjM5MjMzMjY2MA...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>奈雪的荔枝季来了，霸气荔枝+气泡茶双双登场！</td>\n",
       "      <td>2020-05-16</td>\n",
       "      <td>http://mp.weixin.qq.com/s?__biz=MjM5MjMzMjY2MA...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>就在明天！人气餐饮1元秒杀、1折开吃...517吃货节又来了！</td>\n",
       "      <td>2020-05-16</td>\n",
       "      <td>http://mp.weixin.qq.com/s?__biz=MjM5MjMzMjY2MA...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>5</th>\n",
       "      <td>横扫夏日炎热！这8道开胃菜，怎么吃都不腻！爱了~</td>\n",
       "      <td>2020-05-16</td>\n",
       "      <td>http://mp.weixin.qq.com/s?__biz=MjM5MjMzMjY2MA...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>...</th>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>7272</th>\n",
       "      <td>江南西，熟悉的宝藏小店一家没少，都开好了！</td>\n",
       "      <td>2020-03-28</td>\n",
       "      <td>http://mp.weixin.qq.com/s?__biz=MjM5MjMzMjY2MA...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>7273</th>\n",
       "      <td>3月不减肥，接下来都徒伤悲！这份减脂食谱，你必须拥有！</td>\n",
       "      <td>2020-03-28</td>\n",
       "      <td>http://mp.weixin.qq.com/s?__biz=MjM5MjMzMjY2MA...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>7274</th>\n",
       "      <td>@广州所有白衣英雄，整整6个月的福利等你查收！</td>\n",
       "      <td>2020-03-27</td>\n",
       "      <td>http://mp.weixin.qq.com/s?__biz=MjM5MjMzMjY2MA...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>7275</th>\n",
       "      <td>想吃炸鸡的第89天，我已经忍不住了！</td>\n",
       "      <td>2020-03-27</td>\n",
       "      <td>http://mp.weixin.qq.com/s?__biz=MjM5MjMzMjY2MA...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>7276</th>\n",
       "      <td>我劝你“不要和广东人吵架”！因为真的会越骂越饿哈哈哈！</td>\n",
       "      <td>2020-03-27</td>\n",
       "      <td>http://mp.weixin.qq.com/s?__biz=MjM5MjMzMjY2MA...</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>6792 rows × 3 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "                                 title create_time  \\\n",
       "value                                                \n",
       "1                  不走回头路，把广州的森系小店统统逛遍！  2020-05-17   \n",
       "2           意难平！陪你度过童年的20种零食，没有陪你一起长大…  2020-05-17   \n",
       "3               奈雪的荔枝季来了，霸气荔枝+气泡茶双双登场！  2020-05-16   \n",
       "4      就在明天！人气餐饮1元秒杀、1折开吃...517吃货节又来了！  2020-05-16   \n",
       "5             横扫夏日炎热！这8道开胃菜，怎么吃都不腻！爱了~  2020-05-16   \n",
       "...                                ...         ...   \n",
       "7272             江南西，熟悉的宝藏小店一家没少，都开好了！  2020-03-28   \n",
       "7273       3月不减肥，接下来都徒伤悲！这份减脂食谱，你必须拥有！  2020-03-28   \n",
       "7274           @广州所有白衣英雄，整整6个月的福利等你查收！  2020-03-27   \n",
       "7275                想吃炸鸡的第89天，我已经忍不住了！  2020-03-27   \n",
       "7276       我劝你“不要和广东人吵架”！因为真的会越骂越饿哈哈哈！  2020-03-27   \n",
       "\n",
       "                                                    link  \n",
       "value                                                     \n",
       "1      http://mp.weixin.qq.com/s?__biz=MjM5MjMzMjY2MA...  \n",
       "2      http://mp.weixin.qq.com/s?__biz=MjM5MjMzMjY2MA...  \n",
       "3      http://mp.weixin.qq.com/s?__biz=MjM5MjMzMjY2MA...  \n",
       "4      http://mp.weixin.qq.com/s?__biz=MjM5MjMzMjY2MA...  \n",
       "5      http://mp.weixin.qq.com/s?__biz=MjM5MjMzMjY2MA...  \n",
       "...                                                  ...  \n",
       "7272   http://mp.weixin.qq.com/s?__biz=MjM5MjMzMjY2MA...  \n",
       "7273   http://mp.weixin.qq.com/s?__biz=MjM5MjMzMjY2MA...  \n",
       "7274   http://mp.weixin.qq.com/s?__biz=MjM5MjMzMjY2MA...  \n",
       "7275   http://mp.weixin.qq.com/s?__biz=MjM5MjMzMjY2MA...  \n",
       "7276   http://mp.weixin.qq.com/s?__biz=MjM5MjMzMjY2MA...  \n",
       "\n",
       "[6792 rows x 3 columns]"
      ]
     },
     "execution_count": 45,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# tagging 标记\n",
    "tagging_list = [\"\",\"马化腾\", \"腾讯\", \"微众银行\",\"腾讯复星\",\"腾讯风控\",\"腾讯支付\",\"WeChat\",\"We Remit\",\\\n",
    "                \"公益日\",\"红包\",\\\n",
    "                \"腾讯财付通\",\"鹅厂\",\"QQ钱包\",\"QQ红包\",\"QQ\",\\\n",
    "                \"只有一\",\"大咖\",\"听说\",\"图片\",\"照片\",\"小编\",\\\n",
    "                \"洗钱\", \"黑产\",\"被骗\",\"腾讯安全课\",\"诈骗\", \"炒股\",\"神秘兼职\",\"神秘组织\",\\\n",
    "                \"财付通\", \"品牌\",\\\n",
    "                \"收款\",\\\n",
    "                \"报告\",\\\n",
    "                \"银行卡\",\"理财\",\"选股\",\"发票\",\"基金\",\\\n",
    "                \"区块链\",\"金融云\",\"O2O\",\"农产品\",\"家乡\",\\\n",
    "                \"数据\", \"数据赋能\", \"智能\", \"数字孪生\", \"智慧大脑\",\\\n",
    "                \"出行\",\"乘车\",\"公交\",\"乘车码\", \"智慧地铁\",\\\n",
    "                \"高峰论坛\", \"智库\",\\\n",
    "                \"央行\",\"新规\", \\\n",
    "                \"微信\", \"微信支付\", \"跨境支付\", \"移动支付\",\"非银行支付\",\"电子支付\",\\\n",
    "                \"互联网金融\", \"金融科技\",\"互联网＋\",\"互联网+金融\",\"普惠金融\",\"虚拟银行\",\\\n",
    "                \"开放\",\"生态\",\"复杂\",\"互联网思维\",\"全球合作伙伴\",\\\n",
    "                \"联合国\", \"城市\", \"粤港澳大湾区\", \"平台\", \"可持续发展\", \"未来\", \"绿色\",\\\n",
    "                \"医护\",\"防护服\",\"小时\",\"武汉\",\"危机\",\"新冠肺炎\", \"疫\", \"疫情\", \"复工\",\"停课\",\"宅经济\",\\\n",
    "                \"基建\",\"新基建\"] #overwritable\n",
    "\n",
    "v_v_list = []\n",
    "\n",
    "for tag in tagging_list:\n",
    "    index_list = df_url_out [ df_url_out.title.str.contains(tag) ].index.tolist()\n",
    "    v_v_pairs = pd.DataFrame({tag:index_list}).melt().set_index(\"value\")\n",
    "    v_v_list.append(v_v_pairs)\n",
    "\n",
    "df_cat = v_v_list[0]\n",
    "for d in v_v_list:\n",
    "    df_cat.update(d)\n",
    "    \n",
    "# 尚未标记内容\n",
    "df_url_out.loc [ df_cat.query('variable==\"\"').index ]"
   ]
  },
  {
   "cell_type": "raw",
   "metadata": {},
   "source": [
    "df_url_out.loc[53].link"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 46,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>title</th>\n",
       "      <th>create_time</th>\n",
       "      <th>link</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>152</th>\n",
       "      <td>900w名“广漂”：复工第①个月，你收到工资了吗？</td>\n",
       "      <td>2020-03-31</td>\n",
       "      <td>http://mp.weixin.qq.com/s?__biz=MjM5MjMzMjY2MA...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>153</th>\n",
       "      <td>祖传的广式苦味下午茶，让奶茶都失了宠！</td>\n",
       "      <td>2020-03-31</td>\n",
       "      <td>http://mp.weixin.qq.com/s?__biz=MjM5MjMzMjY2MA...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>154</th>\n",
       "      <td>去一趟公园前，我要先饿足三天三夜！</td>\n",
       "      <td>2020-03-31</td>\n",
       "      <td>http://mp.weixin.qq.com/s?__biz=MjM5MjMzMjY2MA...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>155</th>\n",
       "      <td>体育西，复苏ing！</td>\n",
       "      <td>2020-03-30</td>\n",
       "      <td>http://mp.weixin.qq.com/s?__biz=MjM5MjMzMjY2MA...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>156</th>\n",
       "      <td>今天我们不讲故事，只听歌！</td>\n",
       "      <td>2020-03-30</td>\n",
       "      <td>http://mp.weixin.qq.com/s?__biz=MjM5MjMzMjY2MA...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>...</th>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>7272</th>\n",
       "      <td>江南西，熟悉的宝藏小店一家没少，都开好了！</td>\n",
       "      <td>2020-03-28</td>\n",
       "      <td>http://mp.weixin.qq.com/s?__biz=MjM5MjMzMjY2MA...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>7273</th>\n",
       "      <td>3月不减肥，接下来都徒伤悲！这份减脂食谱，你必须拥有！</td>\n",
       "      <td>2020-03-28</td>\n",
       "      <td>http://mp.weixin.qq.com/s?__biz=MjM5MjMzMjY2MA...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>7274</th>\n",
       "      <td>@广州所有白衣英雄，整整6个月的福利等你查收！</td>\n",
       "      <td>2020-03-27</td>\n",
       "      <td>http://mp.weixin.qq.com/s?__biz=MjM5MjMzMjY2MA...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>7275</th>\n",
       "      <td>想吃炸鸡的第89天，我已经忍不住了！</td>\n",
       "      <td>2020-03-27</td>\n",
       "      <td>http://mp.weixin.qq.com/s?__biz=MjM5MjMzMjY2MA...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>7276</th>\n",
       "      <td>我劝你“不要和广东人吵架”！因为真的会越骂越饿哈哈哈！</td>\n",
       "      <td>2020-03-27</td>\n",
       "      <td>http://mp.weixin.qq.com/s?__biz=MjM5MjMzMjY2MA...</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>7124 rows × 3 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "                            title create_time  \\\n",
       "152     900w名“广漂”：复工第①个月，你收到工资了吗？  2020-03-31   \n",
       "153           祖传的广式苦味下午茶，让奶茶都失了宠！  2020-03-31   \n",
       "154             去一趟公园前，我要先饿足三天三夜！  2020-03-31   \n",
       "155                    体育西，复苏ing！  2020-03-30   \n",
       "156                 今天我们不讲故事，只听歌！  2020-03-30   \n",
       "...                           ...         ...   \n",
       "7272        江南西，熟悉的宝藏小店一家没少，都开好了！  2020-03-28   \n",
       "7273  3月不减肥，接下来都徒伤悲！这份减脂食谱，你必须拥有！  2020-03-28   \n",
       "7274      @广州所有白衣英雄，整整6个月的福利等你查收！  2020-03-27   \n",
       "7275           想吃炸鸡的第89天，我已经忍不住了！  2020-03-27   \n",
       "7276  我劝你“不要和广东人吵架”！因为真的会越骂越饿哈哈哈！  2020-03-27   \n",
       "\n",
       "                                                   link  \n",
       "152   http://mp.weixin.qq.com/s?__biz=MjM5MjMzMjY2MA...  \n",
       "153   http://mp.weixin.qq.com/s?__biz=MjM5MjMzMjY2MA...  \n",
       "154   http://mp.weixin.qq.com/s?__biz=MjM5MjMzMjY2MA...  \n",
       "155   http://mp.weixin.qq.com/s?__biz=MjM5MjMzMjY2MA...  \n",
       "156   http://mp.weixin.qq.com/s?__biz=MjM5MjMzMjY2MA...  \n",
       "...                                                 ...  \n",
       "7272  http://mp.weixin.qq.com/s?__biz=MjM5MjMzMjY2MA...  \n",
       "7273  http://mp.weixin.qq.com/s?__biz=MjM5MjMzMjY2MA...  \n",
       "7274  http://mp.weixin.qq.com/s?__biz=MjM5MjMzMjY2MA...  \n",
       "7275  http://mp.weixin.qq.com/s?__biz=MjM5MjMzMjY2MA...  \n",
       "7276  http://mp.weixin.qq.com/s?__biz=MjM5MjMzMjY2MA...  \n",
       "\n",
       "[7124 rows x 3 columns]"
      ]
     },
     "execution_count": 46,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df_url_out[df_url_out.duplicated()]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 47,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>title</th>\n",
       "      <th>create_time</th>\n",
       "      <th>link</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>2蚊鸡！坐上广州这路“美食公交”线，从东山吃到西关！</td>\n",
       "      <td>2020-05-17</td>\n",
       "      <td>http://mp.weixin.qq.com/s?__biz=MjM5MjMzMjY2MA...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>不走回头路，把广州的森系小店统统逛遍！</td>\n",
       "      <td>2020-05-17</td>\n",
       "      <td>http://mp.weixin.qq.com/s?__biz=MjM5MjMzMjY2MA...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>意难平！陪你度过童年的20种零食，没有陪你一起长大…</td>\n",
       "      <td>2020-05-17</td>\n",
       "      <td>http://mp.weixin.qq.com/s?__biz=MjM5MjMzMjY2MA...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>奈雪的荔枝季来了，霸气荔枝+气泡茶双双登场！</td>\n",
       "      <td>2020-05-16</td>\n",
       "      <td>http://mp.weixin.qq.com/s?__biz=MjM5MjMzMjY2MA...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>就在明天！人气餐饮1元秒杀、1折开吃...517吃货节又来了！</td>\n",
       "      <td>2020-05-16</td>\n",
       "      <td>http://mp.weixin.qq.com/s?__biz=MjM5MjMzMjY2MA...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>...</th>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>148</th>\n",
       "      <td>3月不减肥，接下来都徒伤悲！这份减脂食谱，你必须拥有！</td>\n",
       "      <td>2020-03-28</td>\n",
       "      <td>http://mp.weixin.qq.com/s?__biz=MjM5MjMzMjY2MA...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>149</th>\n",
       "      <td>@广州所有白衣英雄，整整6个月的福利等你查收！</td>\n",
       "      <td>2020-03-27</td>\n",
       "      <td>http://mp.weixin.qq.com/s?__biz=MjM5MjMzMjY2MA...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>150</th>\n",
       "      <td>想吃炸鸡的第89天，我已经忍不住了！</td>\n",
       "      <td>2020-03-27</td>\n",
       "      <td>http://mp.weixin.qq.com/s?__biz=MjM5MjMzMjY2MA...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>151</th>\n",
       "      <td>我劝你“不要和广东人吵架”！因为真的会越骂越饿哈哈哈！</td>\n",
       "      <td>2020-03-27</td>\n",
       "      <td>http://mp.weixin.qq.com/s?__biz=MjM5MjMzMjY2MA...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1750</th>\n",
       "      <td>跟着何炅、王一博...宅家做饭！超简单竟然还有点好吃！</td>\n",
       "      <td>2020-03-29</td>\n",
       "      <td>http://mp.weixin.qq.com/s?__biz=MjM5MjMzMjY2MA...</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>153 rows × 3 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "                                title create_time  \\\n",
       "0          2蚊鸡！坐上广州这路“美食公交”线，从东山吃到西关！  2020-05-17   \n",
       "1                 不走回头路，把广州的森系小店统统逛遍！  2020-05-17   \n",
       "2          意难平！陪你度过童年的20种零食，没有陪你一起长大…  2020-05-17   \n",
       "3              奈雪的荔枝季来了，霸气荔枝+气泡茶双双登场！  2020-05-16   \n",
       "4     就在明天！人气餐饮1元秒杀、1折开吃...517吃货节又来了！  2020-05-16   \n",
       "...                               ...         ...   \n",
       "148       3月不减肥，接下来都徒伤悲！这份减脂食谱，你必须拥有！  2020-03-28   \n",
       "149           @广州所有白衣英雄，整整6个月的福利等你查收！  2020-03-27   \n",
       "150                想吃炸鸡的第89天，我已经忍不住了！  2020-03-27   \n",
       "151       我劝你“不要和广东人吵架”！因为真的会越骂越饿哈哈哈！  2020-03-27   \n",
       "1750      跟着何炅、王一博...宅家做饭！超简单竟然还有点好吃！  2020-03-29   \n",
       "\n",
       "                                                   link  \n",
       "0     http://mp.weixin.qq.com/s?__biz=MjM5MjMzMjY2MA...  \n",
       "1     http://mp.weixin.qq.com/s?__biz=MjM5MjMzMjY2MA...  \n",
       "2     http://mp.weixin.qq.com/s?__biz=MjM5MjMzMjY2MA...  \n",
       "3     http://mp.weixin.qq.com/s?__biz=MjM5MjMzMjY2MA...  \n",
       "4     http://mp.weixin.qq.com/s?__biz=MjM5MjMzMjY2MA...  \n",
       "...                                                 ...  \n",
       "148   http://mp.weixin.qq.com/s?__biz=MjM5MjMzMjY2MA...  \n",
       "149   http://mp.weixin.qq.com/s?__biz=MjM5MjMzMjY2MA...  \n",
       "150   http://mp.weixin.qq.com/s?__biz=MjM5MjMzMjY2MA...  \n",
       "151   http://mp.weixin.qq.com/s?__biz=MjM5MjMzMjY2MA...  \n",
       "1750  http://mp.weixin.qq.com/s?__biz=MjM5MjMzMjY2MA...  \n",
       "\n",
       "[153 rows x 3 columns]"
      ]
     },
     "execution_count": 47,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df_url_out[~df_url_out.duplicated()]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 48,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>title</th>\n",
       "      <th>create_time</th>\n",
       "      <th>link</th>\n",
       "      <th>variable</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>2蚊鸡！坐上广州这路“美食公交”线，从东山吃到西关！</td>\n",
       "      <td>2020-05-17</td>\n",
       "      <td>http://mp.weixin.qq.com/s?__biz=MjM5MjMzMjY2MA...</td>\n",
       "      <td>公交</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>不走回头路，把广州的森系小店统统逛遍！</td>\n",
       "      <td>2020-05-17</td>\n",
       "      <td>http://mp.weixin.qq.com/s?__biz=MjM5MjMzMjY2MA...</td>\n",
       "      <td>无法分类</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>意难平！陪你度过童年的20种零食，没有陪你一起长大…</td>\n",
       "      <td>2020-05-17</td>\n",
       "      <td>http://mp.weixin.qq.com/s?__biz=MjM5MjMzMjY2MA...</td>\n",
       "      <td>无法分类</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>奈雪的荔枝季来了，霸气荔枝+气泡茶双双登场！</td>\n",
       "      <td>2020-05-16</td>\n",
       "      <td>http://mp.weixin.qq.com/s?__biz=MjM5MjMzMjY2MA...</td>\n",
       "      <td>无法分类</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>就在明天！人气餐饮1元秒杀、1折开吃...517吃货节又来了！</td>\n",
       "      <td>2020-05-16</td>\n",
       "      <td>http://mp.weixin.qq.com/s?__biz=MjM5MjMzMjY2MA...</td>\n",
       "      <td>无法分类</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>...</th>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>7272</th>\n",
       "      <td>江南西，熟悉的宝藏小店一家没少，都开好了！</td>\n",
       "      <td>2020-03-28</td>\n",
       "      <td>http://mp.weixin.qq.com/s?__biz=MjM5MjMzMjY2MA...</td>\n",
       "      <td>无法分类</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>7273</th>\n",
       "      <td>3月不减肥，接下来都徒伤悲！这份减脂食谱，你必须拥有！</td>\n",
       "      <td>2020-03-28</td>\n",
       "      <td>http://mp.weixin.qq.com/s?__biz=MjM5MjMzMjY2MA...</td>\n",
       "      <td>无法分类</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>7274</th>\n",
       "      <td>@广州所有白衣英雄，整整6个月的福利等你查收！</td>\n",
       "      <td>2020-03-27</td>\n",
       "      <td>http://mp.weixin.qq.com/s?__biz=MjM5MjMzMjY2MA...</td>\n",
       "      <td>无法分类</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>7275</th>\n",
       "      <td>想吃炸鸡的第89天，我已经忍不住了！</td>\n",
       "      <td>2020-03-27</td>\n",
       "      <td>http://mp.weixin.qq.com/s?__biz=MjM5MjMzMjY2MA...</td>\n",
       "      <td>无法分类</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>7276</th>\n",
       "      <td>我劝你“不要和广东人吵架”！因为真的会越骂越饿哈哈哈！</td>\n",
       "      <td>2020-03-27</td>\n",
       "      <td>http://mp.weixin.qq.com/s?__biz=MjM5MjMzMjY2MA...</td>\n",
       "      <td>无法分类</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>7277 rows × 4 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "                                title create_time  \\\n",
       "0          2蚊鸡！坐上广州这路“美食公交”线，从东山吃到西关！  2020-05-17   \n",
       "1                 不走回头路，把广州的森系小店统统逛遍！  2020-05-17   \n",
       "2          意难平！陪你度过童年的20种零食，没有陪你一起长大…  2020-05-17   \n",
       "3              奈雪的荔枝季来了，霸气荔枝+气泡茶双双登场！  2020-05-16   \n",
       "4     就在明天！人气餐饮1元秒杀、1折开吃...517吃货节又来了！  2020-05-16   \n",
       "...                               ...         ...   \n",
       "7272            江南西，熟悉的宝藏小店一家没少，都开好了！  2020-03-28   \n",
       "7273      3月不减肥，接下来都徒伤悲！这份减脂食谱，你必须拥有！  2020-03-28   \n",
       "7274          @广州所有白衣英雄，整整6个月的福利等你查收！  2020-03-27   \n",
       "7275               想吃炸鸡的第89天，我已经忍不住了！  2020-03-27   \n",
       "7276      我劝你“不要和广东人吵架”！因为真的会越骂越饿哈哈哈！  2020-03-27   \n",
       "\n",
       "                                                   link variable  \n",
       "0     http://mp.weixin.qq.com/s?__biz=MjM5MjMzMjY2MA...       公交  \n",
       "1     http://mp.weixin.qq.com/s?__biz=MjM5MjMzMjY2MA...     无法分类  \n",
       "2     http://mp.weixin.qq.com/s?__biz=MjM5MjMzMjY2MA...     无法分类  \n",
       "3     http://mp.weixin.qq.com/s?__biz=MjM5MjMzMjY2MA...     无法分类  \n",
       "4     http://mp.weixin.qq.com/s?__biz=MjM5MjMzMjY2MA...     无法分类  \n",
       "...                                                 ...      ...  \n",
       "7272  http://mp.weixin.qq.com/s?__biz=MjM5MjMzMjY2MA...     无法分类  \n",
       "7273  http://mp.weixin.qq.com/s?__biz=MjM5MjMzMjY2MA...     无法分类  \n",
       "7274  http://mp.weixin.qq.com/s?__biz=MjM5MjMzMjY2MA...     无法分类  \n",
       "7275  http://mp.weixin.qq.com/s?__biz=MjM5MjMzMjY2MA...     无法分类  \n",
       "7276  http://mp.weixin.qq.com/s?__biz=MjM5MjMzMjY2MA...     无法分类  \n",
       "\n",
       "[7277 rows x 4 columns]"
      ]
     },
     "execution_count": 48,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df_o = df_url_out.join(df_cat).replace(\"\", np.nan).fillna(\"无法分类\")\n",
    "df_o"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 51,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>title</th>\n",
       "      <th>create_time</th>\n",
       "      <th>link</th>\n",
       "      <th>variable</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>2蚊鸡！坐上广州这路“美食公交”线，从东山吃到西关！</td>\n",
       "      <td>2020-05-17</td>\n",
       "      <td>http://mp.weixin.qq.com/s?__biz=MjM5MjMzMjY2MA...</td>\n",
       "      <td>公交</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>就在明天！人气餐饮1元秒杀、1折开吃...517吃货节又来了！</td>\n",
       "      <td>2020-05-16</td>\n",
       "      <td>http://mp.weixin.qq.com/s?__biz=MjM5MjMzMjY2MA...</td>\n",
       "      <td>无法分类</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>5</th>\n",
       "      <td>横扫夏日炎热！这8道开胃菜，怎么吃都不腻！爱了~</td>\n",
       "      <td>2020-05-16</td>\n",
       "      <td>http://mp.weixin.qq.com/s?__biz=MjM5MjMzMjY2MA...</td>\n",
       "      <td>无法分类</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>42</th>\n",
       "      <td>在广州的外省人应该怎么吃？</td>\n",
       "      <td>2020-05-04</td>\n",
       "      <td>http://mp.weixin.qq.com/s?__biz=MjM5MjMzMjY2MA...</td>\n",
       "      <td>无法分类</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>44</th>\n",
       "      <td>广州云道攻略更新！暴走4小时，连吃喝指南都整理好了！</td>\n",
       "      <td>2020-05-03</td>\n",
       "      <td>http://mp.weixin.qq.com/s?__biz=MjM5MjMzMjY2MA...</td>\n",
       "      <td>小时</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>...</th>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>7255</th>\n",
       "      <td>跟着何炅、王一博...宅家做饭！超简单竟然还有点好吃！</td>\n",
       "      <td>2020-03-29</td>\n",
       "      <td>http://mp.weixin.qq.com/s?__biz=MjM5MjMzMjY2MA...</td>\n",
       "      <td>无法分类</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>7260</th>\n",
       "      <td>想吃炸鸡的第89天，我已经忍不住了！</td>\n",
       "      <td>2020-03-27</td>\n",
       "      <td>http://mp.weixin.qq.com/s?__biz=MjM5MjMzMjY2MA...</td>\n",
       "      <td>无法分类</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>7267</th>\n",
       "      <td>变着花样吃蛋！这10款神奇的鸡蛋菜谱，男女老少都爱！</td>\n",
       "      <td>2020-03-30</td>\n",
       "      <td>http://mp.weixin.qq.com/s?__biz=MjM5MjMzMjY2MA...</td>\n",
       "      <td>无法分类</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>7270</th>\n",
       "      <td>跟着何炅、王一博...宅家做饭！超简单竟然还有点好吃！</td>\n",
       "      <td>2020-03-29</td>\n",
       "      <td>http://mp.weixin.qq.com/s?__biz=MjM5MjMzMjY2MA...</td>\n",
       "      <td>无法分类</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>7275</th>\n",
       "      <td>想吃炸鸡的第89天，我已经忍不住了！</td>\n",
       "      <td>2020-03-27</td>\n",
       "      <td>http://mp.weixin.qq.com/s?__biz=MjM5MjMzMjY2MA...</td>\n",
       "      <td>无法分类</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>1452 rows × 4 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "                                title create_time  \\\n",
       "0          2蚊鸡！坐上广州这路“美食公交”线，从东山吃到西关！  2020-05-17   \n",
       "4     就在明天！人气餐饮1元秒杀、1折开吃...517吃货节又来了！  2020-05-16   \n",
       "5            横扫夏日炎热！这8道开胃菜，怎么吃都不腻！爱了~  2020-05-16   \n",
       "42                      在广州的外省人应该怎么吃？  2020-05-04   \n",
       "44         广州云道攻略更新！暴走4小时，连吃喝指南都整理好了！  2020-05-03   \n",
       "...                               ...         ...   \n",
       "7255      跟着何炅、王一博...宅家做饭！超简单竟然还有点好吃！  2020-03-29   \n",
       "7260               想吃炸鸡的第89天，我已经忍不住了！  2020-03-27   \n",
       "7267       变着花样吃蛋！这10款神奇的鸡蛋菜谱，男女老少都爱！  2020-03-30   \n",
       "7270      跟着何炅、王一博...宅家做饭！超简单竟然还有点好吃！  2020-03-29   \n",
       "7275               想吃炸鸡的第89天，我已经忍不住了！  2020-03-27   \n",
       "\n",
       "                                                   link variable  \n",
       "0     http://mp.weixin.qq.com/s?__biz=MjM5MjMzMjY2MA...       公交  \n",
       "4     http://mp.weixin.qq.com/s?__biz=MjM5MjMzMjY2MA...     无法分类  \n",
       "5     http://mp.weixin.qq.com/s?__biz=MjM5MjMzMjY2MA...     无法分类  \n",
       "42    http://mp.weixin.qq.com/s?__biz=MjM5MjMzMjY2MA...     无法分类  \n",
       "44    http://mp.weixin.qq.com/s?__biz=MjM5MjMzMjY2MA...       小时  \n",
       "...                                                 ...      ...  \n",
       "7255  http://mp.weixin.qq.com/s?__biz=MjM5MjMzMjY2MA...     无法分类  \n",
       "7260  http://mp.weixin.qq.com/s?__biz=MjM5MjMzMjY2MA...     无法分类  \n",
       "7267  http://mp.weixin.qq.com/s?__biz=MjM5MjMzMjY2MA...     无法分类  \n",
       "7270  http://mp.weixin.qq.com/s?__biz=MjM5MjMzMjY2MA...     无法分类  \n",
       "7275  http://mp.weixin.qq.com/s?__biz=MjM5MjMzMjY2MA...     无法分类  \n",
       "\n",
       "[1452 rows x 4 columns]"
      ]
     },
     "execution_count": 51,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df_o[df_o.title.str.contains(\"吃\")]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 52,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>title</th>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>variable</th>\n",
       "      <th></th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>无法分类</th>\n",
       "      <td>6792</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>复工</th>\n",
       "      <td>476</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>小时</th>\n",
       "      <td>3</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>品牌</th>\n",
       "      <td>2</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>公交</th>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>听说</th>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>开放</th>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>未来</th>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "          title\n",
       "variable       \n",
       "无法分类       6792\n",
       "复工          476\n",
       "小时            3\n",
       "品牌            2\n",
       "公交            1\n",
       "听说            1\n",
       "开放            1\n",
       "未来            1"
      ]
     },
     "execution_count": 52,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df_stats = df_o.groupby(by=\"variable\").agg({\"title\":\"count\"}).sort_values(by=\"title\", ascending=False)\n",
    "df_stats"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 输出"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 53,
   "metadata": {},
   "outputs": [],
   "source": [
    "df_account.columns.name = \"rel_accounts\"\n",
    "df_o.columns.name = \"url_cat\"\n",
    "df_stats.columns.name = \"stats\"\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 54,
   "metadata": {},
   "outputs": [],
   "source": [
    "_df_.columns.name"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 56,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Get the xlsxwriter workbook and worksheet objects.  \n",
    "with pd.ExcelWriter(fn[\"output\"][\"公众号_xlsx\"].format(公众号=公众号)) as writer:\n",
    "    workbook  = writer.book\n",
    "\n",
    "    for _df_ in [df_account, df_o, df_stats]:\n",
    "        _df_.to_excel(writer, sheet_name = _df_.columns.name)\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.8.0"
  },
  "toc": {
   "base_numbering": 1,
   "nav_menu": {},
   "number_sections": true,
   "sideBar": true,
   "skip_h1_title": false,
   "title_cell": "Table of Contents",
   "title_sidebar": "Contents",
   "toc_cell": false,
   "toc_position": {},
   "toc_section_display": true,
   "toc_window_display": true
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}
